import pandas as pd
import numpy as np
import requests
from dotenv import load_dotenv, dotenv_values
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, precision_recall_fscore_support
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegressionCV
from sklearn.dummy import DummyClassifier
import re
from unidecode import unidecode
import json
from tqdm.notebook import tqdm
import plotly.express as px
from time import sleep
pd.set_option('mode.chained_assignment',None)
load_dotenv();
tqdm.pandas()
import plotly
plotly.offline.init_notebook_mode()
tweets = pd.read_csv('training.1600000.processed.noemoticon.csv', header=None)
tweets.shape
(1600000, 6)
tweets.head()
| 0 | 1 | 2 | 3 | 4 | 5 | |
|---|---|---|---|---|---|---|
| 0 | 0 | 1467810369 | Mon Apr 06 22:19:45 PDT 2009 | NO_QUERY | _TheSpecialOne_ | @switchfoot http://twitpic.com/2y1zl - Awww, t... |
| 1 | 0 | 1467810672 | Mon Apr 06 22:19:49 PDT 2009 | NO_QUERY | scotthamilton | is upset that he can't update his Facebook by ... |
| 2 | 0 | 1467810917 | Mon Apr 06 22:19:53 PDT 2009 | NO_QUERY | mattycus | @Kenichan I dived many times for the ball. Man... |
| 3 | 0 | 1467811184 | Mon Apr 06 22:19:57 PDT 2009 | NO_QUERY | ElleCTF | my whole body feels itchy and like its on fire |
| 4 | 0 | 1467811193 | Mon Apr 06 22:19:57 PDT 2009 | NO_QUERY | Karoli | @nationwideclass no, it's not behaving at all.... |
tweets.describe()
| 0 | 1 | |
|---|---|---|
| count | 1.600000e+06 | 1.600000e+06 |
| mean | 2.000000e+00 | 1.998818e+09 |
| std | 2.000001e+00 | 1.935761e+08 |
| min | 0.000000e+00 | 1.467810e+09 |
| 25% | 0.000000e+00 | 1.956916e+09 |
| 50% | 2.000000e+00 | 2.002102e+09 |
| 75% | 4.000000e+00 | 2.177059e+09 |
| max | 4.000000e+00 | 2.329206e+09 |
# extract relevant features (text and associated sentiment)
tweets = tweets[[5,0]]
tweets.columns = (['text', 'sentiment'])
tweets
| text | sentiment | |
|---|---|---|
| 0 | @switchfoot http://twitpic.com/2y1zl - Awww, t... | 0 |
| 1 | is upset that he can't update his Facebook by ... | 0 |
| 2 | @Kenichan I dived many times for the ball. Man... | 0 |
| 3 | my whole body feels itchy and like its on fire | 0 |
| 4 | @nationwideclass no, it's not behaving at all.... | 0 |
| ... | ... | ... |
| 1599995 | Just woke up. Having no school is the best fee... | 4 |
| 1599996 | TheWDB.com - Very cool to hear old Walt interv... | 4 |
| 1599997 | Are you ready for your MoJo Makeover? Ask me f... | 4 |
| 1599998 | Happy 38th Birthday to my boo of alll time!!! ... | 4 |
| 1599999 | happy #charitytuesday @theNSPCC @SparksCharity... | 4 |
1600000 rows × 2 columns
set(tweets['sentiment'])
{0, 4}
We'll only have positive or negative sentiments for these tweets.
sample, _ = train_test_split(tweets, train_size=100, stratify=tweets['sentiment'])
sample.describe()
| sentiment | |
|---|---|
| count | 100.000000 |
| mean | 2.000000 |
| std | 2.010076 |
| min | 0.000000 |
| 25% | 0.000000 |
| 50% | 2.000000 |
| 75% | 4.000000 |
| max | 4.000000 |
for text in sample.text[:20]:
print(text, '\n')
@mattchewww i forgot to set my tivo before i left home im miss rove Jus left MyMy goin 2 thEE house... shaheen = same song @ArunBasilLal All geeks say that :-P @gerryc OH NO! Gerry's tweets are all Español now @temptationdice the english is gaaaaay life sucks rigth now. this kinda of feeling is so strange life seems so short when these thing happen :x ....idk its just a thought Lovin' it!! CL!! @Po3try maybe she has no hair..... jus hate c n otha people sad n dwn. makes me feel bad. had 2 tell sum 1 da truth 2day. now they sad puppy face n all. im like daaaamn son he won't answer me, and it makes me want to cryyy @bookdepository just sent mine..really want that notebook g'mornin all...dont wanna go into work 2day, I think i might still be sick, which isn't pretty after last week BTW END THE FED HR1207! @caitt2104 I love you too @vyzion360 Good Morning to you! Thanks for sharing with us, it's grey and rainy and cold where I am today @scrapchick Thanks! Where are you, anyway? My best friend in Madison had a Dateline viewing party in mah honor. She's prolly drunk now. I'm going to miss you guyss sooo muccchhhh!!!! Keep singing! and as for 'A' class, we see you next month! Love you guys! loaded coronas courtesty of @Niero for @jraquino and me!! @AmberPacific ok well stop cuz ill always be ur virtual little sis *gives innocent look* Driving back home to dothan! Work this afternoon and ethics assignment Bleh
def _clean(text):
    """
    Strip Twitter @mentions and URLs, collapse runs of whitespace,
    and transliterate the result to plain ASCII.
    """
    without_mentions = re.sub(r'@[A-Za-z0-9_]+', '', text)
    without_urls = re.sub(r'http\S+', '', without_mentions)
    normalized = ' '.join(without_urls.split())
    return unidecode(normalized)
def _create_data_for(*content):
"""
Return formatted content for API call.
"""
all_items = ['{ id: "%i", text: "%s"}'% (n+1, text) for n, text in enumerate(content) if text]
data = '{ documents: [' + ', '.join(all_items) + ']}'
return data
def _query_api(data):
    """
    POST `data` (a JSON string) to the Azure Text Analytics sentiment
    endpoint and return the raw requests.Response.

    The subscription key is read from the .env file on every call via
    dotenv_values().
    """
    endpoint = ('https://text-analytics-base.cognitiveservices.azure.com'
                '/text/analytics/v3.2-preview.1/sentiment?opinionMining=true')
    api_key = dotenv_values()['AZURE_TEXT_ANALYTICS_KEY']
    headers = {
        'Content-Type': 'application/json',
        "Ocp-Apim-Subscription-Key": api_key
    }
    # a timeout prevents the notebook from hanging forever on a stalled
    # connection; requests has no default timeout
    result = requests.post(url=endpoint, headers=headers, data=data, timeout=30)
    return result
def _extract_sentiment(result):
"""
Extract value of all keys "sentiment" in result['documents'].
"""
return tuple([item['sentiment'] for item in result.json()['documents']])
def get_sentiment_for(*content):
    """
    Return a sentiment label ('positive', 'negative', 'mixed' or
    'neutral') for each string argument.

    Texts are cleaned, then sent to the Azure API in batches of 10
    documents. The result is a list (not a tuple, as the previous
    docstring claimed) with one label per non-empty cleaned text.
    """
    cleaned_content = [_clean(text) for text in content]
    batch_size = 10
    all_sentiments = []
    # iterate over the start index of each batch of 10 texts
    for head in tqdm(range(0, len(cleaned_content), batch_size)):
        batch = cleaned_content[head:head + batch_size]
        data = _create_data_for(*batch)
        result = _query_api(data)
        all_sentiments.extend(_extract_sentiment(result))
    return all_sentiments
result = get_sentiment_for(*list(sample.text))
sample['computed_sentiment'] = result
sample
| text | sentiment | computed_sentiment | |
|---|---|---|---|
| 1004214 | I follow @Bluenscottish because she finds the ... | 4 | negative |
| 576040 | RIP Hope the Kitty She is missed. | 0 | negative |
| 1530426 | I-announce ko kaya ang Twitter username ko sa ... | 4 | neutral |
| 883453 | @CentCaps Heading to a place where I can sleep... | 4 | positive |
| 890352 | Eating Chinese food with my momma Yuuuuum. ... | 4 | positive |
| ... | ... | ... | ... |
| 1249884 | Come see me AND sing To me | 4 | positive |
| 397312 | Stormy skies on motorway just south of bristol... | 0 | neutral |
| 1428378 | It's hockey time! Then hittin' the town!! | 4 | neutral |
| 128892 | just woke up. now i gotta go to school | 0 | positive |
| 419912 | Watching SLC Punk!, and not looking forward to... | 0 | negative |
100 rows × 3 columns
The possible true values for a sentiment in our dataset are 0 (negative) and 4 (positive), while for the Azure API there are 4 possible values : negative, neutral, positive and mixed. For homogeneity we'll assimilate neutral and mixed to score 2.
# collapse the 4 API labels onto the dataset's 0/2/4 numeric scale;
# 'neutral' and 'mixed' are both assimilated to the middle score 2
sentiments_dict = {"negative": 0, "neutral": 2, "mixed": 2, "positive": 4 }
sample['computed_score'] = sample['computed_sentiment'].apply(sentiments_dict.get)
sample
| text | sentiment | computed_sentiment | computed_score | |
|---|---|---|---|---|
| 1004214 | I follow @Bluenscottish because she finds the ... | 4 | negative | 0 |
| 576040 | RIP Hope the Kitty She is missed. | 0 | negative | 0 |
| 1530426 | I-announce ko kaya ang Twitter username ko sa ... | 4 | neutral | 2 |
| 883453 | @CentCaps Heading to a place where I can sleep... | 4 | positive | 4 |
| 890352 | Eating Chinese food with my momma Yuuuuum. ... | 4 | positive | 4 |
| ... | ... | ... | ... | ... |
| 1249884 | Come see me AND sing To me | 4 | positive | 4 |
| 397312 | Stormy skies on motorway just south of bristol... | 0 | neutral | 2 |
| 1428378 | It's hockey time! Then hittin' the town!! | 4 | neutral | 2 |
| 128892 | just woke up. now i gotta go to school | 0 | positive | 4 |
| 419912 | Watching SLC Punk!, and not looking forward to... | 0 | negative | 0 |
100 rows × 4 columns
ConfusionMatrixDisplay.from_predictions(sample['sentiment'], sample['computed_score']);
This is not a good approach, since we cannot directly compare the API results with the ground-truth values. How can we convert the neutral / mixed values to positive / negative ones? Let's have a closer look at the API output for a sample text:
data = _create_data_for('This was a trip... I loved it. I hated it.')
result = _query_api(data)
result.json()
{'documents': [{'id': '1',
'sentiment': 'mixed',
'confidenceScores': {'positive': 0.5, 'neutral': 0.0, 'negative': 0.5},
'sentences': [{'sentiment': 'neutral',
'confidenceScores': {'positive': 0.23, 'neutral': 0.67, 'negative': 0.1},
'offset': 0,
'length': 18,
'text': 'This was a trip...',
'targets': [],
'assessments': []},
{'sentiment': 'positive',
'confidenceScores': {'positive': 1.0, 'neutral': 0.0, 'negative': 0.0},
'offset': 19,
'length': 11,
'text': 'I loved it.',
'targets': [],
'assessments': []},
{'sentiment': 'negative',
'confidenceScores': {'positive': 0.0, 'neutral': 0.0, 'negative': 1.0},
'offset': 31,
'length': 11,
'text': 'I hated it.',
'targets': [],
'assessments': []}],
'warnings': []}],
'errors': [],
'modelVersion': '2020-04-01'}
This is an extreme example but the idea is to rely on the confidence scores instead of the sentiment returned.
To do so, we are going to query the API for the confidence scores of each tweet individually. Since the API quota is based on the number of processed text records, not on the number of calls, the function is modified to call the API for one text at a time.
sample, _ = train_test_split(tweets, train_size=2000, stratify=tweets['sentiment'])
sample
| text | sentiment | |
|---|---|---|
| 115011 | Where's my sunshine this morning? Is June glo... | 0 |
| 174461 | � miss my future, � wanna see her � for damn ... | 0 |
| 1585470 | @_hayles New! But wayyy behind, only on episo... | 4 |
| 1041463 | @koshian your book was taken by @nazroll and h... | 4 |
| 133716 | @AshantiD Wass good shanti, I gets no love?!?! | 0 |
| ... | ... | ... |
| 448461 | @Cupcakeqween i texted u no reply? | 0 |
| 229705 | Back home from fishing. My wife caught fish an... | 0 |
| 1148283 | My brain exploded and grew back. Whatever they... | 4 |
| 1213820 | Goin maths | 4 |
| 412487 | See? Im pissed that im missing Paramore when t... | 0 |
2000 rows × 2 columns
def _extract_scores(result):
"""
Return values of key 'confidenceScores' in result (json answer from API.)
"""
try:
scores = result.json()['documents'][0]['confidenceScores']
return scores
except (KeyError, IndexError) as e:
print(result.json())
def get_scores_for(text):
    """
    Clean `text`, query the API for this single document, and return
    its sentiment confidence-score dict (or None on an API error).
    """
    payload = _create_data_for(_clean(text))
    response = _query_api(payload)
    return _extract_scores(response)
# Resumable scoring loop: scores accumulate in a module-level dict so
# re-running this cell after an interruption continues where it stopped.
# NOTE(review): rows whose API call failed are not stored, so `start`
# may undercount and a resumed run re-queries those rows.
if 'scores_dict' not in globals():
    scores_dict = {}
start = len(scores_dict)
for row in tqdm(sample[start:].itertuples(), total=sample[start:].shape[0]):
    sleep(0.5)  # throttle to stay under the API rate limit
    index, text, *_ = row
    scores = get_scores_for(text)
    if scores:
        scores_dict.update({index: scores})
{'documents': [], 'errors': [{'id': '1', 'error': {'code': 'InvalidArgument', 'message': 'Invalid document in request.', 'innererror': {'code': 'InvalidDocument', 'message': 'Document text is empty.'}}}], 'modelVersion': '2020-04-01'}
{'documents': [], 'errors': [{'id': '1', 'error': {'code': 'InvalidArgument', 'message': 'Invalid document in request.', 'innererror': {'code': 'InvalidDocument', 'message': 'Document text is empty.'}}}], 'modelVersion': '2020-04-01'}
{'error': {'code': 'InvalidRequest', 'message': 'Invalid Request.', 'innererror': {'code': 'EmptyRequest', 'message': 'Request body must be present.'}}}
{'documents': [], 'errors': [{'id': '1', 'error': {'code': 'InvalidArgument', 'message': 'Invalid document in request.', 'innererror': {'code': 'InvalidDocument', 'message': 'Document text is empty.'}}}], 'modelVersion': '2020-04-01'}
{'documents': [], 'errors': [{'id': '1', 'error': {'code': 'InvalidArgument', 'message': 'Invalid document in request.', 'innererror': {'code': 'InvalidDocument', 'message': 'Document text is empty.'}}}], 'modelVersion': '2020-04-01'}
{'documents': [], 'errors': [{'id': '1', 'error': {'code': 'InvalidArgument', 'message': 'Invalid document in request.', 'innererror': {'code': 'InvalidDocument', 'message': 'Document text is empty.'}}}], 'modelVersion': '2020-04-01'}
{'documents': [], 'errors': [{'id': '1', 'error': {'code': 'InvalidArgument', 'message': 'Invalid document in request.', 'innererror': {'code': 'InvalidDocument', 'message': 'Document text is empty.'}}}], 'modelVersion': '2020-04-01'}
{'documents': [], 'errors': [{'id': '1', 'error': {'code': 'InvalidArgument', 'message': 'Invalid document in request.', 'innererror': {'code': 'InvalidDocument', 'message': 'Document text is empty.'}}}], 'modelVersion': '2020-04-01'}
# append the API confidence scores to the sample dataframe; rows that
# were never scored keep NaN in the three new columns
sample = sample.assign(positive=np.nan, neutral=np.nan, negative=np.nan)
for idx in sample.index:
    scores = scores_dict.get(idx)
    if scores:
        for sentiment_col in ('positive', 'neutral', 'negative'):
            sample.loc[idx, sentiment_col] = scores[sentiment_col]
sample
| text | sentiment | positive | neutral | negative | |
|---|---|---|---|---|---|
| 115011 | Where's my sunshine this morning? Is June glo... | 0 | 0.43 | 0.07 | 0.50 |
| 174461 | � miss my future, � wanna see her � for damn ... | 0 | 0.28 | 0.00 | 0.72 |
| 1585470 | @_hayles New! But wayyy behind, only on episo... | 4 | 0.49 | 0.26 | 0.25 |
| 1041463 | @koshian your book was taken by @nazroll and h... | 4 | 0.14 | 0.83 | 0.03 |
| 133716 | @AshantiD Wass good shanti, I gets no love?!?! | 0 | 0.93 | 0.00 | 0.07 |
| ... | ... | ... | ... | ... | ... |
| 448461 | @Cupcakeqween i texted u no reply? | 0 | 0.03 | 0.43 | 0.54 |
| 229705 | Back home from fishing. My wife caught fish an... | 0 | 0.22 | 0.75 | 0.03 |
| 1148283 | My brain exploded and grew back. Whatever they... | 4 | 0.07 | 0.17 | 0.76 |
| 1213820 | Goin maths | 4 | 0.54 | 0.42 | 0.04 |
| 412487 | See? Im pissed that im missing Paramore when t... | 0 | 0.00 | 0.00 | 1.00 |
2000 rows × 5 columns
# keep only the rows the API successfully scored, persist them to disk,
# then reload and split into features / target
sample = sample.dropna(subset=['positive'])
sample.to_csv('sample_with_api_results.csv', sep='\t')
sample = pd.read_csv('sample_with_api_results.csv', sep='\t')
X = sample[['positive', 'neutral', 'negative']]
y = sample['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
Here we have to stop and think about our aim. If we think in terms of positive reviews and negative reviews, in our case, the false positive reviews are of greater consequence than the false negatives:
In fact, what we are really interested in are the negative reviews : we want to detect "bad buzz".
So we'll here make an important decision: let's call "positive case" the case when the review is scored as negative. Why? Because all the metrics are computed on true positive, false positive, false negative, but never true negative. So if our aim is to detect real negative reviews, it makes sense to consider it as our positive class.
This leads us to consider the false positive and false negative cases in a new way:
Let's write it again so that it is perfectly clear: from now on, our positive label will be the label "0" that corresponds to negative sentiment, dissatisfaction. And our negative label will be of course the label 4.
In terms of precision and recall, this means that we are more interested in having a high recall (avoid false negative) than in having a high precision (avoid false positive).
But we cannot focus only on recall: if we consider all our customers as dissatisfied, we may end up trying to fix an imaginary dissatisfaction, that may lead us to useless costs (proposing vouchers to the customer for instance) or creating a "ridiculous buzz" on social networks by apologizing on good reviews.
To make a tradeoff between precision and recall, we'll use the $F_\beta$ score, which lets us put more weight on false negatives:
$F_\beta = \frac{(1+\beta^2)\,TP}{(1+\beta^2)\,TP + \beta^2\,FN + FP}$
So here we'll use $\beta=2$ to put the focus on false negatives.
class ResultManager:
    """
    Compute and accumulate precision / recall / F-beta scores for
    successive models so they can be compared in one recap table.
    """
    def __init__(self, beta=2, pos_label=0):
        # one row per named model, filled by _compute_results
        self.recap = pd.DataFrame(columns=['precision', 'recall', 'Fbeta'])
        self.beta = beta
        # label 0 (negative tweet) is treated as the positive class:
        # we want to detect dissatisfaction
        self.pos_label = pos_label
    def _compute_results(self, y_true, y_pred, name):
        '''
        Compute precision, recall and Fbeta for the positive label,
        store them under `name` in self.recap (first call per name
        only) and return them.
        '''
        precision, recall, Fbeta, _ = [item[0] for item in
                                       precision_recall_fscore_support(
                                           y_true, y_pred,
                                           beta=self.beta,
                                           labels=[self.pos_label])]
        if name not in self.recap.index.values:
            row = pd.DataFrame([[precision, recall, Fbeta]],
                               columns=self.recap.columns,
                               index=[name])
            # DataFrame.append was removed in pandas 2.0;
            # pd.concat is the supported way to add a row
            self.recap = pd.concat([self.recap, row])
        return precision, recall, Fbeta
    def display_results(self, y_true, y_pred, name, beta=3):
        '''
        Print precision, recall and Fbeta score for `name` and plot
        the confusion matrix. (`beta` is unused and kept only for
        backward compatibility; the instance's self.beta applies.)
        '''
        precision, recall, Fbeta = self._compute_results(y_true, y_pred, name)
        # use a distinct loop variable: the original shadowed the
        # `name` parameter here
        metric_labels = ['precision', 'recall', f'Fbeta-score (beta={self.beta})']
        for label, metric in zip(metric_labels, [precision, recall, Fbeta]):
            print(f'{label}: ', '{:.2%}'.format(metric))
        ConfusionMatrixDisplay.from_predictions(y_true, y_pred);
rm = ResultManager()
dc = DummyClassifier(strategy='stratified', random_state=42).fit(X_train, y_train)
dummy_pred = dc.predict(X_test)
rm.display_results(y_test, dummy_pred, 'dummy')
precision: 51.46% recall: 52.74% Fbeta-score (beta=2): 52.48%
y_pred = LogisticRegressionCV(random_state=42).fit(X_train, y_train).predict(X_test)
rm.display_results(y_test, y_pred, 'logistic regression')
precision: 77.03% recall: 56.72% Fbeta-score (beta=2): 59.87%
px.scatter_3d(X_test, x='positive', y='neutral', z='negative', color=y_test)
Since the data don't seem to be linearly separable, let's try other methods.
from sklearn.tree import DecisionTreeClassifier
y_pred = DecisionTreeClassifier(random_state=42).fit(X_train, y_train).predict(X_test)
rm.display_results(y_test, y_pred, 'decision tree')
precision: 68.84% recall: 68.16% Fbeta-score (beta=2): 68.30%
from sklearn.ensemble import RandomForestClassifier
y_pred = RandomForestClassifier(random_state=42).fit(X_train, y_train).predict(X_test)
rm.display_results(y_test, y_pred, 'random forest classifier')
precision: 69.95% recall: 67.16% Fbeta-score (beta=2): 67.70%
And with a Gradient Boosting Classifier?
from sklearn.ensemble import GradientBoostingClassifier
y_pred = GradientBoostingClassifier(random_state=42).fit(X_train, y_train).predict(X_test)
rm.display_results(y_test, y_pred, 'gradient boosting classifier')
precision: 71.50% recall: 68.66% Fbeta-score (beta=2): 69.21%
from sklearn.svm import SVC
y_pred = SVC(random_state=42).fit(X_train, y_train).predict(X_test)
rm.display_results(y_test, y_pred, 'SVC')
precision: 74.70% recall: 61.69% Fbeta-score (beta=2): 63.92%
rm.recap.sort_values(by='Fbeta', ascending=False)
| precision | recall | Fbeta | |
|---|---|---|---|
| gradient boosting classifier | 0.715026 | 0.686567 | 0.692076 |
| decision tree | 0.688442 | 0.681592 | 0.682951 |
| random forest classifier | 0.699482 | 0.671642 | 0.677031 |
| SVC | 0.746988 | 0.616915 | 0.639175 |
| logistic regression | 0.770270 | 0.567164 | 0.598739 |
| dummy | 0.514563 | 0.527363 | 0.524752 |
We did not fine-tune hyperparameters, nor cross-validate the models. But since our aim is just to get an idea of the performance we may expect from a simple model (and since the default sklearn parameters have been carefully selected to work well in many cases), we can use these scores as benchmarks for comparison with the performance of a neural network.